Set coding environment

knitr::opts_chunk$set(warning = FALSE, message = FALSE)
rm(list = ls()) # clean environment
cat("\014") # clean console

Load libraries (install.packages if not found)

library(datasets)
library(caret)
library(GGally)
library(viridis)
library(plotly)
library(cluster)

Import dataset and explore

dim(iris)
## [1] 150   5
sapply(iris, function(x) length(unique(x)))
## Sepal.Length  Sepal.Width Petal.Length  Petal.Width      Species 
##           35           23           43           22            3
str(iris)
## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
summary(iris)
##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500  
##        Species  
##  setosa    :50  
##  versicolor:50  
##  virginica :50  
##                 
##                 
## 
df = iris

Preprocessing

norm.values = preProcess(iris[,-5], method = c("center","scale"))
iris.norm = predict(norm.values,iris[,-5])
head(iris.norm)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1   -0.8976739  1.01560199    -1.335752   -1.311052
## 2   -1.1392005 -0.13153881    -1.335752   -1.311052
## 3   -1.3807271  0.32731751    -1.392399   -1.311052
## 4   -1.5014904  0.09788935    -1.279104   -1.311052
## 5   -1.0184372  1.24503015    -1.335752   -1.311052
## 6   -0.5353840  1.93331463    -1.165809   -1.048667

Clustering

#best cluster based on WSSE

k = seq(1,15,1)
metrics.df <- data.frame(k, wsse = rep(0, length(k)))

for (i in 1:length(k)){
  cluster_i = kmeans(iris.norm, centers = i, nstart = 25)
  metrics.df[i, 2] = cluster_i$tot.withinss 
}

plot_kmeans = ggplot(metrics.df, aes(x=k, y=wsse)) + geom_line(linetype = 3) + geom_point(shape=1) + ylab("Total Within Sum of Squares") + scale_x_continuous(breaks = k) + theme_classic()
plot_kmeans

set.seed(1)
k_clusters = kmeans(iris.norm, 3, nstart = 25)

k_clusters$size
## [1] 53 47 50
df$clusters = as.factor(k_clusters$cluster)

ggparcoord(df[,-5], columns = 1:4, groupColumn = 5, order = "anyClass", scale="globalminmax", showPoints = T, title = "Parallel Plot Iris dataset with k-clusters", alphaLines = 0.3) + scale_color_viridis(discrete = T, option = "E")  + theme(plot.title = element_text(size=10)) + coord_flip() + theme_bw()

centroids = data.frame(k_clusters$centers)

scale_back_z_score <- function(z,var) {
  sd_train = sd(df[,var])
  mean_train = mean(df[,var])
  x = sapply(z, function(x_) x_*sd_train + mean_train)
  return(x)
}

centroids$Sepal.Length = scale_back_z_score(centroids$Sepal.Length, "Sepal.Length")
centroids$Sepal.Width = scale_back_z_score(centroids$Sepal.Width, "Sepal.Width")
centroids$Petal.Length = scale_back_z_score(centroids$Petal.Length, "Petal.Length")
centroids$Petal.Width = scale_back_z_score(centroids$Petal.Width, "Petal.Width")

p <- plot_ly(df[,-c(1,5)], x = ~Petal.Length, z = ~Petal.Width, y = ~Sepal.Width)
fig <- p %>% add_markers(color = ~clusters, size = 10) %>% add_mesh(opacity=0.1, data=df[df$clusters==2,], alphahull =0) %>% add_mesh(opacity=0.1, data=df[df$clusters==3,], alphahull = 0) %>% add_mesh(opacity=0.1, data=df[df$clusters==1,], alphahull = 0) %>% add_trace(data = centroids, type = "scatter3d", x = ~Petal.Length, z = ~Petal.Width, y = ~Sepal.Width, mode = "markers", marker = list(size = 5, color = "black", symbol="cross"), name="centroids")
fig
#check consistency
sil = silhouette(k_clusters$cluster, dist(df[,-c(5,6)]))
plot(sil, col=1:3, border=NA)